library(ggplot2)
library(phyloseq)
library(randomForest)
library(tidyverse)
library(ggpubr)
library(microbiome)
library(dysbiosisR)
library(ggvenn)
library(NBZIMM)
library(ZIBR)
library(reshape)
library(RColorBrewer)
ps_data <- readRDS("data/ps_data")
#remove Caucasian subjects
ps_data <- subset_samples(ps_data, Race_ID == "African.American")
# Extract sample data as a data frame
sample_df <- as_data_frame(sample_data(ps_data))
## Warning: `as_data_frame()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` (with slightly different semantics) to convert to a
## tibble, or `as.data.frame()` to convert to a data frame.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Add Age_Cat2
sample_df <- sample_df %>%
mutate(Age = as.numeric(as.character(Age))) %>%
mutate(Age_Cat2 = case_when(
Age < 40 ~ "Less than 40",
Age >= 40 & Age < 60 ~ "40 to 59",
Age >= 60 ~ "60+"
))
sample_df <- as.data.frame(sample_df)
#Preserve original rownames (sample names)
rownames(sample_df) <- rownames(ps_data@sam_data)
identical(colnames(ps_data@otu_table), rownames(sample_df))
## [1] TRUE
# add back to physeq
sample_data(ps_data) <- sample_data(sample_df)
#convert phylseq object into a dataframe
ps_data.df <- psmelt(ps_data)
#cleanup Genus names
ps_data.df$Genus <- sub("g__", "", ps_data.df$Genus)
#cleanup Phyla names
ps_data.df$Phylum <- sub("p__", "", ps_data.df$Phylum)
#transform counts to relative abundances
ps_rel <- transform_sample_counts(ps_data, function(OTU) OTU/sum(OTU))
#convert the relative abundance data into a dataframe
ps_rel.df <- psmelt(ps_rel)
#cleanup Genus names
ps_rel.df$Genus <- sub("g__", "", ps_rel.df$Genus)
#set the theme for all plots
theme_set(theme_bw())
#a color palette for use in plotting
palette <- c("lightpink", "cornflowerblue", "yellow2", "darkblue", "darkorange1", "deeppink", "darkviolet", "palegreen", "springgreen4", "black", "goldenrod", "red", "blue", "#3DB7E9", "#D55E00", "#F0E442",'#253494', "gray50", "purple", "orange", "lightgreen", "forestgreen")
Explore metadata
df.meta <- as(sample_data(ps_data), "data.frame")
df.meta.unique <- df.meta[!duplicated(df.meta$Person_ID), ]
df.meta.unique %>% count(Hair_Type)
## Hair_Type n
## 1 Color 5
## 2 Keratin.Treatment 1
## 3 Natural 20
## 4 Relaxed 8
## 5 <NA> 2
df.meta.unique %>% count(Race_ID)
## Race_ID n
## 1 African.American 36
df.meta.unique %>% count(Gender)
## Gender n
## 1 Female 29
## 2 Male 7
df.meta.unique %>% count(Cleansing_Freq)
## Cleansing_Freq n
## 1 Biweekly 16
## 2 Daily 1
## 3 Monthly 7
## 4 Weekly 12
range(df.meta.unique$Age)
## [1] 23 73
#check the number of participants
length(unique(df.meta$Person_ID))
## [1] 36
#check the number of samples per participants
df.meta %>%
count(Person_ID) %>%
arrange(desc(n))
## Person_ID n
## 1 Person.02 2
## 2 Person.03 2
## 3 Person.05 2
## 4 Person.07 2
## 5 Person.09 2
## 6 Person.10 2
## 7 Person.11 2
## 8 Person.12 2
## 9 Person.13 2
## 10 Person.14 2
## 11 Person.15 2
## 12 Person.17 2
## 13 Person.18 2
## 14 Person.19 2
## 15 Person.22 2
## 16 Person.23 2
## 17 Person.24 2
## 18 Person.25 2
## 19 Person.26 2
## 20 Person.29 2
## 21 Person.32 2
## 22 Person.33 2
## 23 Person.35 2
## 24 Person.36 2
## 25 Person.37 2
## 26 Person.40 2
## 27 Person.42 2
## 28 Person.44 2
## 29 Person.46 2
## 30 Person.47 2
## 31 Person.49 2
## 32 Person.06 1
## 33 Person.16 1
## 34 Person.27 1
## 35 Person.41 1
## 36 Person.45 1
#check the number of samples per hair site type
df.meta %>%
count(Area_Cond)
## Area_Cond n
## 1 Afflicted 35
## 2 Normal 32
#check the age ranges
range(df.meta$Age)
## [1] 23 73
# Density
ggplot(df.meta.unique, aes(x = Age, y = Density_Scale)) +
geom_point() +
geom_smooth(method = "lm") +
theme_classic() +
stat_cor() +
labs(x = "Age (years)",
y = "Scalp Density") +
theme(axis.line.y = element_blank(),
axis.ticks.y = element_blank(),
axis.text.x = element_text(family = "Arial", size = 12),
axis.text.y = element_text(family = "Arial", size = 12))+
theme_bw()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_cor()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

# stress
ggplot(df.meta.unique, aes(x = Age, y = Stress_Scale)) +
geom_point() +
geom_smooth(method = "lm") +
theme_classic() +
stat_cor() +
labs(x = "Age (years)",
y = "Scalp Stress") +
theme(axis.line.y = element_blank(),
axis.ticks.y = element_blank(),
axis.text.x = element_text(family = "Arial", size = 12),
axis.text.y = element_text(family = "Arial", size = 12))+
theme_bw()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_cor()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

#check the number of samples per participants
df.meta %>%
count(Person_ID, X.SampleID)
## Person_ID X.SampleID n
## 1 Person.02 140GP 1
## 2 Person.02 143GP 1
## 3 Person.03 098GP 1
## 4 Person.03 099GP 1
## 5 Person.05 213GP 1
## 6 Person.05 240GP 1
## 7 Person.06 234GP 1
## 8 Person.07 080GP 1
## 9 Person.07 084GP 1
## 10 Person.09 238GP 1
## 11 Person.09 239GP 1
## 12 Person.10 231GP 1
## 13 Person.10 232GP 1
## 14 Person.11 183GP 1
## 15 Person.11 188GP 1
## 16 Person.12 070GP 1
## 17 Person.12 078GP 1
## 18 Person.13 062GP 1
## 19 Person.13 072GP 1
## 20 Person.14 211GP 1
## 21 Person.14 216GP 1
## 22 Person.15 008GP 1
## 23 Person.15 009GP 1
## 24 Person.16 174GP 1
## 25 Person.17 064GP 1
## 26 Person.17 068GP 1
## 27 Person.18 142GP 1
## 28 Person.18 148GP 1
## 29 Person.19 173GP 1
## 30 Person.19 175GP 1
## 31 Person.22 090GP 1
## 32 Person.22 091GP 1
## 33 Person.23 144GP 1
## 34 Person.23 146GP 1
## 35 Person.24 243GP 1
## 36 Person.24 244GP 1
## 37 Person.25 092GP 1
## 38 Person.25 093GP 1
## 39 Person.26 104GP 1
## 40 Person.26 107GP 1
## 41 Person.27 241GP 1
## 42 Person.29 082GP 1
## 43 Person.29 086GP 1
## 44 Person.32 200GP 1
## 45 Person.32 202GP 1
## 46 Person.33 102GP 1
## 47 Person.33 109GP 1
## 48 Person.35 118GP 1
## 49 Person.35 119GP 1
## 50 Person.36 246GP 1
## 51 Person.36 247GP 1
## 52 Person.37 248GP 1
## 53 Person.37 249GP 1
## 54 Person.40 236GP 1
## 55 Person.40 237GP 1
## 56 Person.41 060GP 1
## 57 Person.42 201GP 1
## 58 Person.42 203GP 1
## 59 Person.44 096GP 1
## 60 Person.44 097GP 1
## 61 Person.45 204GP 1
## 62 Person.46 209GP 1
## 63 Person.46 210GP 1
## 64 Person.47 094GP 1
## 65 Person.47 095GP 1
## 66 Person.49 207GP 1
## 67 Person.49 208GP 1
Explore taxonomy
#total number of sequences
sum(ps_data.df$Abundance)
## [1] 1720130
#percentage of each phylum in descending order
percent_phyla <- ps_data.df %>%
group_by(Phylum) %>%
summarize(sum = sum(Abundance/1720130)*100) %>%
arrange(desc(sum))
percent_phyla
## # A tibble: 38 × 2
## Phylum sum
## <chr> <dbl>
## 1 Firmicutes 50.6
## 2 Actinobacteria 33.0
## 3 Proteobacteria 11.6
## 4 Cyanobacteria 3.15
## 5 Bacteroidetes 1.14
## 6 Fusobacteria 0.283
## 7 Verrucomicrobia 0.0400
## 8 Acidobacteria 0.0292
## 9 <NA> 0.0246
## 10 Planctomycetes 0.0221
## # ℹ 28 more rows
#percentage of each genus in descending order
percent_genera <- ps_data.df %>%
group_by(Genus) %>%
summarize(sum = sum(Abundance/1720130)*100) %>%
arrange(desc(sum))
percent_genera
## # A tibble: 380 × 2
## Genus sum
## <chr> <dbl>
## 1 "Staphylococcus" 41.8
## 2 "Corynebacterium" 26.8
## 3 "" 7.69
## 4 "Propionibacterium" 4.04
## 5 "Acinetobacter" 3.08
## 6 "Streptococcus" 2.80
## 7 "Anaerococcus" 1.77
## 8 "Enhydrobacter" 1.52
## 9 "Burkholderia" 1.18
## 10 <NA> 1.16
## # ℹ 370 more rows
ps_data.df %>%
filter(Genus == c("Staphylococcus", "Corynebacterium", "Propionibacterium")) %>%
group_by(Genus, Area_Cond) %>%
summarize(average = sum(Abundance/1720130)*100)
## `summarise()` has grouped output by 'Genus'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 3
## # Groups: Genus [3]
## Genus Area_Cond average
## <chr> <chr> <dbl>
## 1 Corynebacterium Afflicted 4.24
## 2 Corynebacterium Normal 5.42
## 3 Propionibacterium Afflicted 0.781
## 4 Propionibacterium Normal 0.513
## 5 Staphylococcus Afflicted 5.88
## 6 Staphylococcus Normal 9.64
ps_data.df %>%
filter(Genus == c("Staphylococcus", "Corynebacterium", "Propionibacterium")) %>%
group_by(Genus, Area_Cond) %>%
summarize(average = sum(Abundance/1720130)*100) %>%
ggplot(aes(Area_Cond, average, fill = Genus))+
geom_bar(stat="identity")+
coord_flip()+
theme_bw()+
scale_fill_viridis_d()+
labs(x = "", y = "Average Relative Abundance (%)")
## `summarise()` has grouped output by 'Genus'. You can override using the
## `.groups` argument.

plot_df <- ps_rel.df %>%
mutate(Genus = ifelse(Abundance < 0.05, " Other < 5%", Genus),
Genus = ifelse(is.na(Genus) | Genus == "", "Unclassified", Genus)) %>%
group_by(Sample, Area_Cond, Genus) %>%
summarize(Abundance = sum(Abundance), .groups = "drop") %>%
group_by(Area_Cond, Genus) %>%
summarize(mean_abundance = mean(Abundance) * 100,
sd_abundance = sd(Abundance) * 100,
.groups = "drop")
ggplot(plot_df, aes(x = Genus, y = mean_abundance, fill = Area_Cond)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.9), alpha=0.7) +
geom_errorbar(aes(ymin = mean_abundance - sd_abundance, ymax = mean_abundance + sd_abundance),
width = 0.3, position = position_dodge(width = 0.9)) +
theme_bw() +
labs(x = "", y = "Average Relative Abundance (%)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.title = element_blank()) +
stat_compare_means(
method = "wilcox",
paired = TRUE,
hide.ns = TRUE,
label = "p.signif") +
scale_fill_manual(values = c("red", "black"))+
coord_flip()

ggsave("figures/Supplemental_AverageRelativeAbd_Genus.pdf")
## Saving 7 x 5 in image
Alpha diversity
#generate alpha diversity metrics
alpha <- microbiome::alpha(ps_data, index = "all")
## Observed richness
## Other forms of richness
## Diversity
## Evenness
## Dominance
## Rarity
#assign sample names to the new data
alpha$X.SampleID <- rownames(alpha)
# extract sample data from the phyloseq object
df <- as(sample_data(ps_data), "data.frame")
# Merge Tables
alpha.table <- merge(alpha, df, by = "X.SampleID")
#Shannon diversity
ggplot(alpha.table, aes(x = Area_Cond, y = diversity_shannon, fill = Area_Cond)) +
geom_boxplot() +
stat_compare_means(method="wilcox") +
labs(x = "Skin Site",
y = "Shannon") +
theme(axis.line.y = element_blank(),
axis.ticks.y = element_blank(),
legend.title = element_blank())

#Number of OTUs
ggplot(alpha.table, aes(x = Area_Cond, y = observed, fill = Area_Cond)) +
geom_boxplot() +
stat_compare_means() +
labs(x = "Skin Site",
y = "Oberserved OTUs") +
theme(axis.line.y = element_blank(),
axis.ticks.y = element_blank())

#Calculate average number of OTUs per group
alpha.table %>%
group_by(Area_Cond) %>%
summarise(mean(observed))
## # A tibble: 2 × 2
## Area_Cond `mean(observed)`
## <chr> <dbl>
## 1 Afflicted 363.
## 2 Normal 297.
#Shannon vs Age
ggplot(alpha.table,
aes(x = Age, y = diversity_shannon, fill = Area_Cond, color = Area_Cond)) +
geom_point(aes(color = Area_Cond)) +
geom_smooth(method = "lm", fill = "gray", guide = "none") +
stat_cor() +
labs(x = "Age", y = "Shannon Diversity", title = "A") +
scale_color_manual(values = c("red", "black"))+
guides(color = guide_legend(override.aes = list(shape = 16, size = 3, fill = NA)))+
theme(axis.line.y = element_blank(),
axis.ticks.y = element_blank(),
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12),
legend.text = element_text(size = 10),
legend.title = element_blank())
## Warning in geom_smooth(method = "lm", fill = "gray", guide = "none"): Ignoring
## unknown parameters: `guide`
## `geom_smooth()` using formula = 'y ~ x'

ggsave("figures/Fig1a.pdf", height = 4, width = 5)
## `geom_smooth()` using formula = 'y ~ x'
alpha.table %>%
mutate(Age_Cat2 = case_when(Age < 40 ~ "1) Less than 40",
Age < 60 & Age > 39 ~ "2) 40 to 59",
Age > 59 ~ "3) 60+")) %>%
ggplot(aes(x = Area_Cond, y = diversity_shannon)) +
geom_boxplot(aes(color = Area_Cond), show.legend = F) +
geom_point(aes(color = Area_Cond))+
stat_compare_means(method="wilcox") +
stat_cor()+
scale_color_manual(values = c("red", "black"))+
facet_wrap(~Age_Cat2)+
guides(color = guide_legend(override.aes = list(shape = 16, size = 3, fill = NA)))+
labs(x = " ", y = "Shannon Diversity", title = "B") +
theme(axis.line.y = element_blank(),
axis.ticks.y = element_blank(),
axis.text.x = element_text(size = 12),
axis.text.y = element_text(size = 12),
legend.position = "none")

ggsave("figures/Fig1b.pdf", height = 4, width = 5)
alpha.table %>%
filter(Area_Cond == "Afflicted") %>%
ggplot(aes(x = Cleansing_Freq, y = diversity_shannon, fill = Cleansing_Freq)) +
geom_boxplot() +
stat_compare_means(label = "p.signif") +
geom_point()+
labs(x = " ",
y = "Shannon Diversity") +
theme(axis.line.y = element_blank(),
axis.ticks.y = element_blank(),
axis.text.x = element_text(family = "Arial", size = 12),
axis.text.y = element_text(family = "Arial", size = 12),
legend.text = element_text(family = "Arial", size = 10))

alpha.table %>%
filter(Area_Cond == "Afflicted") %>%
mutate(Type = ifelse(Hair_Type == "Natural", "Natural", "Other")) %>%
filter(Type != is.na(Type)) %>%
ggplot(aes(x = Type, y = diversity_shannon, fill = Type)) +
geom_boxplot() +
stat_compare_means(label = "p.signif") +
geom_point()+
labs(x = " ",
y = "Shannon Diversity") +
theme(axis.line.y = element_blank(),
axis.ticks.y = element_blank(),
axis.text.x = element_text(family = "Arial", size = 12),
axis.text.y = element_text(family = "Arial", size = 12),
legend.text = element_text(family = "Arial", size = 10))

alpha.table %>%
filter(Area_Cond == "Afflicted") %>%
ggplot(aes(x = Hair_Type, y = diversity_shannon, fill = Hair_Type)) +
geom_boxplot() +
stat_compare_means(label = "p.signif") +
geom_point()+
labs(x = " ",
y = "Shannon Diversity") +
theme(axis.line.y = element_blank(),
axis.ticks.y = element_blank(),
axis.text.x = element_text(family = "Arial", size = 12),
axis.text.y = element_text(family = "Arial", size = 12),
legend.text = element_text(family = "Arial", size = 10))

alpha.table %>%
filter(Area_Cond == "Afflicted") %>%
ggplot(aes(x = Gender, y = diversity_shannon, fill = Gender)) +
geom_boxplot() +
stat_compare_means(label = "p.signif") +
geom_point()+
labs(x = " ",
y = "Shannon Diversity") +
theme(axis.line.y = element_blank(),
axis.ticks.y = element_blank(),
axis.text.x = element_text(family = "Arial", size = 12),
axis.text.y = element_text(family = "Arial", size = 12),
legend.text = element_text(family = "Arial", size = 10))

Beta Diversity
#CLR before Euclidean distance for Aitchison distances
#Center log ratio the phyloseq file
alo_clr <- microbiome::transform(ps_data, "clr")
#PCA via phyloseq
ord_clr <- phyloseq::ordinate(alo_clr, "RDA")
#ord_clr <- phyloseq::ordinate(alo_clr, "PCoA")
plot_ordination(alo_clr, ord_clr, color = "Area_Cond")

#Plot scree plot
phyloseq::plot_scree(ord_clr) +
geom_bar(stat="identity", fill = "blue") +
labs(x = "Axis", y = "Proportion of Variance")

#Examine eigenvalues and % proportion of variance explained
head(ord_clr$CA$eig)
## PC1 PC2 PC3 PC4 PC5 PC6
## 201.42564 123.57687 109.34320 100.35991 86.12584 79.91789
#What proportion of variance explained in actual understandable form as a decimal
sapply(ord_clr$CA$eig[1:10], function(x) x / sum(ord_clr$CA$eig))
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## 0.08508721 0.05220195 0.04618930 0.04239453 0.03638170 0.03375931 0.03193174
## PC8 PC9 PC10
## 0.03022906 0.02759772 0.02710738
clr1 <- ord_clr$CA$eig[1] / sum(ord_clr$CA$eig)
clr2 <- ord_clr$CA$eig[2] / sum(ord_clr$CA$eig)
phyloseq::plot_ordination(ps_data, ord_clr, type="samples", color="Area_Cond")+
geom_point() +
coord_fixed(clr2 / clr1) +
stat_ellipse(aes(group = Area_Cond), linetype = 2)+
scale_color_manual(values = c("red", "black"))+
theme(legend.title = element_blank(), legend.position = "none")+
labs(title = "C")

ggsave("figures/Fig1c.pdf", width = 4, height = 6)
ord <- ordinate(alo_clr, method = "PCoA", distance = "euclidean")
# Extract ordination coordinates and merge with metadata
ord_df <- plot_ordination(alo_clr, ord, type = "samples", justDF = TRUE)
# Add metadata: Site condition and Person_ID
ord_df$Area_Cond <- sample_data(alo_clr)$Area_Cond
ord_df$Person_ID <- sample_data(alo_clr)$Person_ID
# Plot with lines connecting samples from the same individual
ggplot(ord_df, aes(x = Axis.1, y = Axis.2, color = Area_Cond)) +
geom_point(size = 3) +
geom_line(aes(group = Person_ID), color = "gray70", linewidth = 0.5) +
theme_minimal() +
labs(title = "PCoA (CLR + Euclidean): Afflicted vs. Normal Sites",
x = "PCoA Axis 1", y = "PCoA Axis 2",
color = "Site Type") +
theme(legend.position = "right")

#Generate distance matrix
clr_dist_matrix <- phyloseq::distance(alo_clr, method = "euclidean")
#ADONIS test for Age -> Also significant!
vegan::adonis2(clr_dist_matrix ~phyloseq::sample_data(alo_clr)$Age_Cat2, permutations = 10000)
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 10000
##
## vegan::adonis2(formula = clr_dist_matrix ~ phyloseq::sample_data(alo_clr)$Age_Cat2, permutations = 10000)
## Df SumOfSqs R2 F Pr(>F)
## Model 2 8106 0.05188 1.7511 9.999e-05 ***
## Residual 64 148135 0.94812
## Total 66 156241 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#ADONIS test for Gender -> Also significant!
vegan::adonis2(clr_dist_matrix ~phyloseq::sample_data(alo_clr)$Gender, permutations = 10000)
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 10000
##
## vegan::adonis2(formula = clr_dist_matrix ~ phyloseq::sample_data(alo_clr)$Gender, permutations = 10000)
## Df SumOfSqs R2 F Pr(>F)
## Model 1 4663 0.02985 1.9998 3e-04 ***
## Residual 65 151577 0.97015
## Total 66 156241 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#ADONIS test for Normal vs Afflicted
vegan::adonis2(clr_dist_matrix ~phyloseq::sample_data(alo_clr)$Area_Cond, permutations = 10000, strata=sample_df$Person_ID)
## Permutation test for adonis under reduced model
## Blocks: strata
## Permutation: free
## Number of permutations: 10000
##
## vegan::adonis2(formula = clr_dist_matrix ~ phyloseq::sample_data(alo_clr)$Area_Cond, permutations = 10000, strata = sample_df$Person_ID)
## Df SumOfSqs R2 F Pr(>F)
## Model 1 2954 0.0189 1.2524 9.999e-05 ***
## Residual 65 153287 0.9811
## Total 66 156241 1.0000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#ADONIS test for Normal vs Afflicted -> Significant
vegan::adonis2(clr_dist_matrix ~phyloseq::sample_data(alo_clr)$Area_Cond, permutations = 10000)
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 10000
##
## vegan::adonis2(formula = clr_dist_matrix ~ phyloseq::sample_data(alo_clr)$Area_Cond, permutations = 10000)
## Df SumOfSqs R2 F Pr(>F)
## Model 1 2954 0.0189 1.2524 0.0491 *
## Residual 65 153287 0.9811
## Total 66 156241 1.0000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Dispersion test for Area_Cond (Normal vs Afflicted)
dispr <- vegan::betadisper(clr_dist_matrix, phyloseq::sample_data(alo_clr)$Area_Cond)
anova(dispr)
## Analysis of Variance Table
##
## Response: Distances
## Df Sum Sq Mean Sq F value Pr(>F)
## Groups 1 1159.1 1159.13 8.0916 0.00594 **
## Residuals 65 9311.3 143.25
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Dispersion test for age
dispr2 <- vegan::betadisper(clr_dist_matrix, phyloseq::sample_data(alo_clr)$Age_Cat2)
anova(dispr2)
## Analysis of Variance Table
##
## Response: Distances
## Df Sum Sq Mean Sq F value Pr(>F)
## Groups 2 2926.4 1463.22 12.332 2.95e-05 ***
## Residuals 64 7593.8 118.65
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Dispersion test for Gender
dispr3 <- vegan::betadisper(clr_dist_matrix, phyloseq::sample_data(alo_clr)$Gender)
anova(dispr3)
## Analysis of Variance Table
##
## Response: Distances
## Df Sum Sq Mean Sq F value Pr(>F)
## Groups 1 263.7 263.69 1.6132 0.2086
## Residuals 65 10625.2 163.47
#Dispersion plot for Area_Cond + boxplot
plot(dispr, main = "Ordination Centroids and Dispersion Labeled: Aitchison Distance", sub = "")

boxplot(dispr, main = "", xlab = "")

#Dispersion plot for Age_Cat + boxplot
plot(dispr2, main = "Ordination Centroids and Dispersion Labeled:Aitchison Distance", sub = "")

boxplot(dispr2, main = "", xlab = "")

#Dispersion plot for Gender + boxplot
plot(dispr3, main = "Ordination Centroids and Dispersion Labeled:Aitchison Distance", sub = "")

boxplot(dispr3, main = "", xlab = "")

#Generate distance matrix
clr_dist_matrix <- phyloseq::distance(alo_clr, method = "euclidean")
# Distance matrix to long format
dist_long <- as.data.frame(as.table(as.matrix(clr_dist_matrix)))
colnames(dist_long) <- c("Sample1", "Sample2", "Distance")
# Add metadata
meta <- sample_df[, c("X.SampleID", "Person_ID", "Area_Cond")]
dist_long <- dist_long %>%
inner_join(meta, by = c("Sample1" = "X.SampleID")) %>%
inner_join(meta, by = c("Sample2" = "X.SampleID"),
suffix = c("_1", "_2"))
#keep a copy of the matrix
dist_long_1 <- dist_long
# Remove redundant/self comparisons
dist_long <- dist_long %>%
filter(Sample1 < Sample2)
# Define "Within" vs "Between"
dist_long <- dist_long %>%
mutate(Comparison = if_else(Person_ID_1 == Person_ID_2, "Within", "Between")) %>%
mutate(Condition = Area_Cond_1)
#keep a copy
dist_long_2 <-dist_long
ggplot(dist_long, aes(x = Comparison, y = Distance, fill = Condition)) +
geom_boxplot(outlier.shape = NA, alpha = 0.7) +
#facet_wrap(~ Condition) +
labs(title = "Within vs Between-Person Distances by Condition",
y = "Distance", x = NULL) +
theme_minimal() +
scale_fill_manual(values = c("black", "red"))+
stat_compare_means(method = "wilcox")+
theme(legend.title = element_blank())

ggsave("figures/varianceWcondition.pdf")
## Saving 7 x 5 in image
ggplot(dist_long, aes(x = Comparison, y = Distance)) +
geom_boxplot(outlier.shape = NA, alpha = 0.7) +
#facet_wrap(~ Condition) +
labs(title = "Within vs Between-Person Distances",
y = "Distance", x = NULL) +
theme_minimal() +
stat_compare_means(method = "wilcox")

ggsave("figures/variance.pdf")
## Saving 7 x 5 in image
Dysbiosis Scores
#center log ratio (clr) transformation
clr <- microbiome::transform(ps_data, "clr")
#make a distance matrix
dist.matrix <- phyloseq::distance(clr, "euclidean") #euclidean distances of clr transformed data are Aitchison distances
#set Normal as the baseline
ref.samples <- sample_names(subset_samples(ps_data,
Area_Cond == "Normal"))
#calculate dysbiosis scores
dysbiosis <- euclideanDistCentroids(ps_data,
dist_mat = dist.matrix,
use_squared = F,
group_col = "Area_Cond",
control_label = "Normal",
case_label = "Afflicted")
# order the data
dysbiosis$Area_Cond <- factor(dysbiosis$Area_Cond,
levels = c("Normal", "Afflicted"))
#AOC plot to check sensitivity and specificity of the model
AUCplot <- pROC::roc(as.factor(dysbiosis$Area_Cond),
dysbiosis$CentroidDist_score,
#direction= ">",
plot=TRUE,
ci = TRUE,
auc.polygon=TRUE,
max.auc.polygon=TRUE,
print.auc=TRUE)
## Setting levels: control = Normal, case = Afflicted
## Setting direction: controls < cases

#calculate average dysbiosis score by group
dysbiosis %>%
group_by(Area_Cond) %>%
summarise(mean(CentroidDist_score))
## # A tibble: 2 × 2
## Area_Cond `mean(CentroidDist_score)`
## <fct> <dbl>
## 1 Normal -2.39
## 2 Afflicted 1.24
#check to see if there is a signicant different in dysbiosis scores between groups
ggplot(dysbiosis, aes(Area_Cond, CentroidDist_score)) +
geom_boxplot()+
geom_point()+
stat_compare_means(method = "wilcox.test")+
theme_bw()

#set a dysbiosis threshold
dysbiosis <- dysbiosis %>%
mutate(dysbiotic = case_when(CentroidDist_score > 0 ~ "Dysbiotic",
CentroidDist_score < 0 ~ "Normobiosis" ))
#check which hair sites got dysbiosis assignments
dysbiosis %>%
group_by(dysbiotic, Area_Cond) %>%
count(dysbiotic) %>%
arrange(desc(n))
## # A tibble: 4 × 3
## # Groups: dysbiotic, Area_Cond [4]
## dysbiotic Area_Cond n
## <chr> <fct> <int>
## 1 Normobiosis Normal 30
## 2 Dysbiotic Afflicted 24
## 3 Normobiosis Afflicted 11
## 4 Dysbiotic Normal 2
plotDysbiosisGradient(df=dysbiosis,
score="CentroidDist_score",
high_line = 0.0,
group_var = "Area_Cond",
group_colors=c("Normal" = "black",
"Afflicted"= "brown3"),
bg_colors = rev(brewer.pal(9, "Reds")),
jitter_width = 0.1) +
labs(y="Dysbiosis Score", subtitle = " ") +
theme(legend.title = element_blank())+
# adjust the x and y values to fit to plot
ggplot2::annotate("text", x = 0.65, y = .1, label = "Center",color="white")

ggsave("Figures/Fig2.pdf", height =6, width =5)
plotDysbiosisGradient(df = dysbiosis,
score = "CentroidDist_score",
high_line = 0.0,
group_var = "Area_Cond",
group_colors = c("Normal" = "black", "Afflicted" = "brown3"),
bg_colors = rev(brewer.pal(9, "Reds")),
jitter_width = 0) +
geom_text(data = dysbiosis,
aes(y = CentroidDist_score,
x = 1, # or adjust to match the y-location of points
label = Person_ID),
#position = position_jitter(width = 0.1, height = 0),
size = 3,
vjust = -1,
check_overlap = TRUE) +
labs(y = "Dysbiosis Score", subtitle = " ") +
theme(legend.title = element_blank()) +
annotate("text", x = 0.65, y = 0.1, label = "Center", color = "white")

#ggsave("figures/labeleddysbiois.pdf")
Mixed Effects Model
ps_data@sam_data$Area_Cond <- as.factor(ps_data@sam_data$Area_Cond)
ps_data@sam_data$Area_Cond <- relevel(ps_data@sam_data$Area_Cond, ref = "Normal")
levels(ps_data@sam_data$Area_Cond)
## [1] "Normal" "Afflicted"
# Aggregate Table by OTU
df.agg <- stats::aggregate(Abundance ~ OTU + X.SampleID, ps_data.df, FUN = sum)
# Make Blank Cells Unspecified
df.agg$OTU[df.agg$OTU == ""] <- "Unassigned"
# Manipulate Data Table to Short Version
df.agg.otus <- as.data.frame(reshape::cast(df.agg, X.SampleID ~ OTU, sum))
## Using Abundance as value column. Use the value argument to cast to override this choice
data <- df.agg.otus #preserve data before making a matrix
# Make SampleID Rownames and Remove Column
rownames(df.agg.otus) <- df.agg.otus$X.SampleID
df.agg.otus$X.SampleID <- NULL # to make a matrix
# Check Dimensions
dim(df.agg.otus)
### Create Variables for Model ###
# Sample Data
df.meta <- sample_data(ps_data)
Age <- df.meta$Age
Area_Cond <- df.meta$Area_Cond
# create data frame containing all the variables of interest
df.meta <- as_data_frame(df.meta)
df.meta.var <- df.meta %>% dplyr::select(X.SampleID, Person_ID, Age, Age_Cat, Gender)
data.var <- left_join(data, df.meta, by = "X.SampleID" )
#mem <- NBZIMM::mms(y = df.agg.otus, fixed = ~ Area_Cond, data = data.var, random = ~ 1 | Age, min.p = 0.2, method = "nb")
#saveRDS(mem, "output/mem")
mem <- readRDS("output/mem")
# Extract data
out <- mem$call$fixed
out = NBZIMM::fixed(mem)$dist
out = out[out[,2]!="(Intercept)", ]
res = out[, 3:5]
#remove the Area_CondAfflicted from the rowname
rownames(res) <- gsub("--Area_CondAfflicted", "", rownames(res))
# make a data frame
res.data <- as.data.frame(res)
#add OTU column to the data frame
res.data$OTU <- rownames(res.data)
#filter sigificant results
res.data <- res.data %>%
filter(pvalue <0.05)
#join results with all data
res.data.df <- left_join(res.data, ps_data.df, by = "OTU")
#clean up genus column
res.data.df$Genus <- sub("g__", "", res.data.df$Genus)
#number of OTUs that were more abundant in afflicted sites
res.data %>% filter(Estimate > 0)
## Estimate Std.Error pvalue
## 1000547 0.9123754 0.3727216 1.873928e-02
## 1001007 1.0322221 0.3951044 1.250536e-02
## 1015518 1.2098636 0.4374904 8.482653e-03
## 103411 1.3047249 0.2841304 4.122552e-05
## 1062356 0.9123062 0.3466475 1.191626e-02
## 1065569 0.8742012 0.3266396 1.065055e-02
## 1079708 1.2304012 0.3526376 1.172251e-03
## 109591 1.3050612 0.3301427 2.980036e-04
## 1096610 0.9777314 0.2428713 2.390798e-04
## 130864 2.5411772 0.2451496 5.057877e-13
## 134726 0.9503889 0.3709128 1.416904e-02
## 137183 1.7767134 0.4684931 4.823857e-04
## 159711 2.5480086 0.3672748 2.018415e-08
## 161677 3.4444797 0.8135205 1.262391e-04
## 1646259 0.9771663 0.3179838 3.757922e-03
## 1790396 0.9031755 0.3597093 1.608074e-02
## 2119418 1.2021677 0.3029106 2.841902e-04
## 224222 1.4834588 0.5373076 8.582639e-03
## 245648 1.0903653 0.3840787 7.011469e-03
## 274365 1.4454274 0.7082210 4.773126e-02
## 2751958 1.8342521 0.6823155 1.033336e-02
## 28246 1.0241342 0.4266782 2.100792e-02
## 30062 2.0953411 0.4024426 5.757770e-06
## 3225199 0.8084541 0.2856933 7.180177e-03
## 3359884 0.9676768 0.2276376 1.198082e-04
## 3384047 0.6420287 0.2725796 2.336936e-02
## 3600504 1.9216245 0.5612035 1.412402e-03
## 4128270 0.8545624 0.3860617 3.248289e-02
## 4296424 0.6971983 0.3342814 4.327394e-02
## 4297695 0.9268213 0.3759882 1.797814e-02
## 4302049 0.7082492 0.3422008 4.482184e-02
## 4305837 1.0592354 0.5027460 4.128867e-02
## 4306048 1.0845270 0.3107363 1.168727e-03
## 4307484 0.9331459 0.2913348 2.630740e-03
## 4309301 1.1485033 0.2941490 3.449434e-04
## 4309764 1.2948719 0.3764286 1.350158e-03
## 4311939 1.5500920 0.4906645 2.969187e-03
## 4315974 0.8492640 0.3077919 8.620200e-03
## 4318672 0.7210365 0.2949983 1.890577e-02
## 4321559 0.9719827 0.4615224 4.136781e-02
## 4337755 1.4453299 0.4121670 1.114546e-03
## 4349519 1.5107386 0.3988903 4.896826e-04
## 4353264 1.4938954 0.3920611 4.572572e-04
## 4361528 1.2858893 0.3858308 1.830275e-03
## 4364813 0.7985477 0.3281459 1.939875e-02
## 4404220 0.8296380 0.3661475 2.880624e-02
## 4418009 1.5682956 0.3432408 4.431877e-05
## 4425214 1.0297908 0.2504964 1.843111e-04
## 4427114 1.4514783 0.5174584 7.658180e-03
## 4428042 0.9148156 0.3181235 6.367975e-03
## 4439603 0.6946130 0.2576785 1.014145e-02
## 4446902 0.9824038 0.3226795 4.059508e-03
## 4449458 2.7875026 0.6809617 1.944596e-04
## 4449609 2.2084170 0.8799984 1.613113e-02
## 4453501 2.0146149 0.3748345 3.336690e-06
## 4455767 1.6624036 0.3885685 1.100881e-04
## 4466006 1.3448948 0.3793517 9.969286e-04
## 4469359 1.8910959 0.5558044 1.502378e-03
## 4471251 1.8759443 0.5563844 1.639662e-03
## 4476950 1.9553430 0.5083117 4.101906e-04
## 4483015 1.7043849 0.3903989 8.387549e-05
## 494906 0.7934991 0.3766175 4.128775e-02
## 504674 1.9218533 0.8540771 2.986485e-02
## 544480 1.3617864 0.4154170 2.134162e-03
## 545299 0.6579111 0.3218337 4.738608e-02
## 64653 2.2775105 0.5710855 2.680467e-04
## 710275 1.1140223 0.3261624 1.447357e-03
## 711275 1.4098026 0.3000002 2.933922e-05
## 753638 1.0641256 0.3347341 2.811115e-03
## 823916 0.9387880 0.3245235 6.086809e-03
## 859700 1.3725355 0.3945559 1.208025e-03
## 863124 1.5220274 0.4191908 7.769671e-04
## 864465 0.6446034 0.3063618 4.154948e-02
## 886735 0.8147830 0.4010442 4.870373e-02
## 900973 1.2470355 0.1959480 1.312688e-07
## 907241 2.1813456 0.6587331 1.943737e-03
## 912997 0.9621059 0.3876269 1.725108e-02
## 913149 0.3822520 0.1887448 4.938733e-02
## 933546 1.1619670 0.3162489 6.842268e-04
## 963344 2.1799632 0.5169328 1.330143e-04
## 964220 0.9964333 0.3748705 1.115396e-02
## New.CleanUp.ReferenceOTU7806 1.4892082 0.3994054 5.829405e-04
## New.CleanUp.ReferenceOTU955 1.0747083 0.4031269 1.093489e-02
## OTU
## 1000547 1000547
## 1001007 1001007
## 1015518 1015518
## 103411 103411
## 1062356 1062356
## 1065569 1065569
## 1079708 1079708
## 109591 109591
## 1096610 1096610
## 130864 130864
## 134726 134726
## 137183 137183
## 159711 159711
## 161677 161677
## 1646259 1646259
## 1790396 1790396
## 2119418 2119418
## 224222 224222
## 245648 245648
## 274365 274365
## 2751958 2751958
## 28246 28246
## 30062 30062
## 3225199 3225199
## 3359884 3359884
## 3384047 3384047
## 3600504 3600504
## 4128270 4128270
## 4296424 4296424
## 4297695 4297695
## 4302049 4302049
## 4305837 4305837
## 4306048 4306048
## 4307484 4307484
## 4309301 4309301
## 4309764 4309764
## 4311939 4311939
## 4315974 4315974
## 4318672 4318672
## 4321559 4321559
## 4337755 4337755
## 4349519 4349519
## 4353264 4353264
## 4361528 4361528
## 4364813 4364813
## 4404220 4404220
## 4418009 4418009
## 4425214 4425214
## 4427114 4427114
## 4428042 4428042
## 4439603 4439603
## 4446902 4446902
## 4449458 4449458
## 4449609 4449609
## 4453501 4453501
## 4455767 4455767
## 4466006 4466006
## 4469359 4469359
## 4471251 4471251
## 4476950 4476950
## 4483015 4483015
## 494906 494906
## 504674 504674
## 544480 544480
## 545299 545299
## 64653 64653
## 710275 710275
## 711275 711275
## 753638 753638
## 823916 823916
## 859700 859700
## 863124 863124
## 864465 864465
## 886735 886735
## 900973 900973
## 907241 907241
## 912997 912997
## 913149 913149
## 933546 933546
## 963344 963344
## 964220 964220
## New.CleanUp.ReferenceOTU7806 New.CleanUp.ReferenceOTU7806
## New.CleanUp.ReferenceOTU955 New.CleanUp.ReferenceOTU955
#number of unique genera that were differentially abundant
unique(res.data.df$Genus)
## [1] "Streptococcus" "Staphylococcus" "Corynebacterium"
## [4] "Acinetobacter" NA "Finegoldia"
## [7] "Lactobacillus" "" "Anaerococcus"
## [10] "Paracoccus" "Bacteroides" "Pseudomonas"
## [13] "Actinomyces" "Campylobacter" "Chryseobacterium"
## [16] "Rothia" "Neisseria" "Porphyromonas"
## [19] "Gemella" "Actinobacillus" "Sphingomonas"
## [22] "Veillonella" "Haemophilus" "Granulicatella"
## [25] "Peptoniphilus" "Fusobacterium" "Dialister"
## [28] "Enhydrobacter" "Alloiococcus" "Propionibacterium"
#count the Genus assignments of those OTUs
res.data.df %>%
distinct(OTU, .keep_all=TRUE) %>%
filter(Genus != is.na(Genus)) %>%
filter(Genus != "") %>%
group_by(Genus) %>%
count() %>%
arrange(desc(n))
## # A tibble: 28 × 2
## # Groups: Genus [28]
## Genus n
## <chr> <int>
## 1 Corynebacterium 18
## 2 Streptococcus 17
## 3 Acinetobacter 8
## 4 Anaerococcus 8
## 5 Staphylococcus 7
## 6 Enhydrobacter 3
## 7 Haemophilus 2
## 8 Lactobacillus 2
## 9 Rothia 2
## 10 Actinobacillus 1
## # ℹ 18 more rows
res.data.df %>%
filter(pvalue < 0.05) %>%
filter(Genus != "Unassigned") %>%
filter(Genus != "") %>%
mutate(Genus1 = reorder(Genus, Estimate)) %>%
ggplot(aes(x = Genus1, y = Estimate)) +
geom_point(alpha = 0.5) +
geom_errorbar(aes(ymin=Estimate-Std.Error, ymax=Estimate+Std.Error), width=.1, alpha = 0.5) +
coord_flip() +
labs(y = "Estimate", x = " ")+
geom_hline(yintercept = 0, lty = 2) +
theme(axis.line.y = element_blank(),
axis.ticks.y = element_blank(),
axis.text.y = element_text(size = 12))

ggsave("figures/Fig4.pdf")
## Saving 7 x 5 in image
res.data.df %>%
filter(pvalue <0.05) %>%
filter(Species != "Unassigned") %>% #remove any OTUs that don't have assigned species
filter(Species != "s__") %>%
mutate(Species1 = reorder(Species, Estimate)) %>%
ggplot(aes(x = Species1, y = Estimate, color = Genus)) +
geom_point(alpha = 0.5)+
#geom_jitter()+
geom_errorbar(aes(ymin=Estimate-Std.Error,
ymax=Estimate+Std.Error), width=.1, alpha = 0.5) +
coord_flip() +
geom_hline(yintercept = 0, lty = 2) +
theme(axis.line.y = element_blank(),
axis.ticks.y = element_blank(),
axis.title.y= element_blank(),
axis.text.x = element_text(family = "Arial", size = 10),
axis.text.y = element_text(family = "Arial", size = 10))

Random Forest with dysbiosis score
#set dysbiosis threshold and add column
dysbiosis <- dysbiosis %>%
mutate(dysbiotic = case_when(CentroidDist_score > 0 ~ "Dysbiotic",
CentroidDist_score < 0 ~ "Normobiosis" ))
#add dysbiosis score and assignments to the sample data
ps_data@sam_data$score <- dysbiosis$dysbiotic
ps_data@sam_data$numericScore <- dysbiosis$CentroidDist_score
#Prep data
predict <- t(otu_table(ps_data))
#Check dimensions
dim(predict)
## [1] 67 6009
#Create response variable
res <- as.factor(sample_data(ps_data)$score)
#Combine predict variable and response variable into one dataframe
machine.data <- data.frame(res, predict)
#check dimensions
dim(machine.data)
## [1] 67 6010
#
#class <- randomForest(res ~ ., data = machine.data, ntree = 1001, importance = TRUE, nodesize = 1)
#saveRDS(class, "output/RF")
class <- readRDS("output/RF") #by default a random forest will be different every time you run it, so here we're loading saved RF previously made
print(class)
##
## Call:
## randomForest(formula = res ~ ., data = machine.data, ntree = 1001, importance = TRUE, nodesize = 1)
## Type of random forest: classification
## Number of trees: 1001
## No. of variables tried at each split: 77
##
## OOB estimate of error rate: 20.9%
## Confusion matrix:
## Dysbiotic Normobiosis class.error
## Dysbiotic 15 11 0.42307692
## Normobiosis 3 38 0.07317073
confusiondf <- as.data.frame(class$confusion)
confusionfixed <- data.frame(Actual = c("Dysbiotic", "Dysbiotic", "Normobiosis", "Normobiosis"), Predicted = c("Dysbiotic", "Normobiosis", "Dysbiotic", "Normobiosis"), Count = c(confusiondf[1,1], confusiondf[1,2], confusiondf[2,1], confusiondf[2,2]), Freq = c(confusiondf[1,3], confusiondf[1,3], confusiondf[2,3], confusiondf[2,3]))
ggplot(confusionfixed, aes(x = Predicted, y = Actual, fill = Count)) +
geom_tile()+
geom_text(aes(label = Count), fontface="bold", color = "white") +
geom_text(aes(label = Freq, vjust = 3), fontface="bold") +
theme_bw() +
theme(legend.position="none") +
labs(x = "Predicted", y = "Actual")

#extract the predictors from the RF
imp.taxa <- randomForest::importance(class)
#make a dataframe
imp.taxa <- data.frame(predictors = rownames(imp.taxa), imp.taxa)
#rename the object for easier coding
imp <- imp.taxa
# Order the predictor levels by importance (Mean decrease accuracy)
imp.sort <- arrange(imp, desc(MeanDecreaseAccuracy))
imp.sort$predictors <- factor(imp.sort$predictors, levels = imp.sort$predictors)
# Select the top 100 predictors
imp.100 <- imp.sort[1:100, ]
# Change Column name to say "OTUID" rather than "predictors"
colnames(imp.100)[which(names(imp.100) == "predictors")] <- "OTUID"
# Remove "X" from OTUID column
imp.100$OTUID <- gsub("X", "", paste(imp.100$OTUID))
# Make Taxa table from phyloseq object a data frame
otu_df <- as.data.frame(tax_table(ps_data))
# Make OTU IDs (row names) into column
otu_df$OTUID <- rownames(otu_df)
# Merge two data frames using matched column so that we know which taxa are assigned to the OTUs
imp100.merged <- merge(imp.100, otu_df, by = "OTUID")
#rename the object
RFtop100_Otus <- imp100.merged
#count the number of unique genera
unique(RFtop100_Otus$Genus)
## [1] "g__Streptococcus" "g__Corynebacterium" "g__Veillonella"
## [4] "g__Anaerococcus" NA "g__Staphylococcus"
## [7] "g__Pseudomonas" "g__Finegoldia" "g__Peptoniphilus"
## [10] "g__Carnobacterium" "g__" "g__Actinomyces"
## [13] "g__Acinetobacter" "g__[Prevotella]" "g__Blautia"
## [16] "g__Haemophilus" "g__Rothia" "g__Mycobacterium"
## [19] "g__Gemella" "g__Actinobacillus" "g__Micrococcus"
## [22] "g__Aggregatibacter" "g__Propionibacterium" "g__Granulicatella"
## [25] "g__Megasphaera" "g__Paracoccus" "g__Bifidobacterium"
## [28] "g__Dialister" "g__Alloiococcus"
#cleanup Genus names
imp100.merged$Genus <- sub("g__", "", imp100.merged$Genus)
#plot
imp100.merged %>%
mutate(Genus = reorder(Genus, MeanDecreaseAccuracy)) %>%
filter(Genus != "") %>%
filter(Genus != is.na(Genus)) %>%
ggplot(aes(x = reorder(Genus, MeanDecreaseAccuracy, sum),
y = MeanDecreaseAccuracy,
fill = OTUID)) + # Stacked by OTU
geom_bar(stat = "identity", color = "black") +
coord_flip() +
labs(x = "Top taxa for classifying dysbiotic scalp sites", y = "Mean Decrease Accuracy (%)")+
theme(legend.position = "none")

ggsave("figures/Fig3.pdf", height = 5, width =6)
Compare Important Taxa
#extract diff abundant taxa from the mixed effected model
res.data <- res.data %>% filter(pvalue <0.05)
#check the intersection of OTUs between the random forest and mixed effects models
shared_sig.OTU <- intersect(res.data$OTU, RFtop100_Otus$OTU)
shared_sig.OTU #42 shared otus
## [1] "1000547" "1015518"
## [3] "1062356" "1065569"
## [5] "1079708" "109591"
## [7] "1096610" "161677"
## [9] "30062" "3225199"
## [11] "3384047" "4296424"
## [13] "4302049" "4306048"
## [15] "4307484" "4309301"
## [17] "4309764" "4311939"
## [19] "4315974" "4337755"
## [21] "4349519" "4404220"
## [23] "4425214" "4439603"
## [25] "4446902" "4453501"
## [27] "4455767" "4466006"
## [29] "4469359" "4471251"
## [31] "4476950" "4483015"
## [33] "494906" "544480"
## [35] "711275" "753638"
## [37] "859700" "863124"
## [39] "886735" "912997"
## [41] "New.CleanUp.ReferenceOTU7806" "New.ReferenceOTU11"
#make a new data frame keeping only OTUs sig in both models
OTU.filt.df <- ps_data.df %>% filter(OTU %in% shared_sig.OTU)
#repeat for the relative abundance data frame
OTU.filt_rel.df <- ps_rel.df%>% filter(OTU %in% shared_sig.OTU)
#add a variable to each set of results indicating which test they came from
res.data$test <- "Mixed Effects Model"
RFtop100_Otus$test <- "Random Forest Classifer"
#change the column name so they match
RFtop100_Otus$OTU <- RFtop100_Otus$OTUID
#create a shared data frame
shared <- full_join(res.data, RFtop100_Otus, by = c("OTU", "test"))
#keep only OTUs that are in both models
keep <- union(res.data$OTU, RFtop100_Otus$OTU)
shared <- shared %>% filter(OTU %in% keep)
#make a list with the two tests seperated
data_list <- split(shared$OTU, shared$test)
#make a venn diagram
ggvenn(data_list, label_sep = "\n", fill_color = c("white", "white"), show_elements = F)

Confirm differentially abundant OTUs
#transform to log10 abundances
OTU.filt.df$log10Abund <- log10( 1 + OTU.filt.df$Abundance)
# Run Wilcoxon test for each OTU and store p-values
otu_pvals_notsig <- OTU.filt.df %>%
group_by(OTU) %>%
summarise(p_value = wilcox.test(log10Abund ~ Area_Cond)$p.value) %>%
filter(p_value > 0.05)
## Warning: There were 42 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `p_value = wilcox.test(log10Abund ~ Area_Cond)$p.value`.
## ℹ In group 1: `OTU = "1000547"`.
## Caused by warning in `wilcox.test.default()`:
## ! cannot compute exact p-value with ties
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 41 remaining warnings.
#Extract OTU names from the data frame of shared OTUs by both RF and Mixed effects model
OTUs <- sort(unique(OTU.filt.df$OTU))
#OTUs to remove
remove <- otu_pvals_notsig$OTU
#Significant OTUs
OTUs_sig <- OTUs[!OTUs %in% remove]
#keep only significant OTUs
OTU.filt.df <- OTU.filt.df %>%
filter(OTU %in% OTUs_sig)
OTU.filt_rel.df <- OTU.filt_rel.df%>%
filter(OTU %in% OTUs_sig)
sort(unique(OTU.filt_rel.df$Genus))
## [1] "" "Acinetobacter" "Anaerococcus" "Corynebacterium"
## [5] "Finegoldia" "Gemella" "Rothia" "Streptococcus"
sort(unique(OTU.filt.df$Genus))
## [1] "" "Acinetobacter" "Anaerococcus" "Corynebacterium"
## [5] "Finegoldia" "Gemella" "Rothia" "Streptococcus"
OTU.filt_rel.df %>%
filter(Genus != "") %>%
group_by(Genus) %>% # Group by Genus before reordering
mutate(OTU = factor(OTU, levels = unique(OTU[order(Abundance, decreasing = TRUE)]))) %>%
ggplot(aes(OTU, Abundance, fill = Area_Cond)) +
geom_boxplot(alpha =0.7, outliers = F) +
geom_point(aes(OTU, Abundance), position = position_dodge(width = 0.75), size = 0.3, alpha = 0.7) +
scale_y_log10(breaks = scales::breaks_log(n = 10),
labels = scales::number_format(accuracy = 0.0001)) +
coord_flip() +
labs(y= "Relative Abundance") +
scale_fill_manual(values = c("red", "black")) +
theme(legend.title = element_blank(),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.5),
axis.text.y = element_blank())+
facet_wrap(~Genus, ncol = 3, scales = "free_y")
## Warning in scale_y_log10(breaks = scales::breaks_log(n = 10), labels = scales::number_format(accuracy = 1e-04)): log-10 transformation introduced infinite values.
## log-10 transformation introduced infinite values.
## Warning: Removed 680 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

ggsave("figures/Figure5.pdf")
## Saving 7 x 5 in image
## Warning in scale_y_log10(breaks = scales::breaks_log(n = 10), labels =
## scales::number_format(accuracy = 1e-04)): log-10 transformation introduced
## infinite values.
## Warning in scale_y_log10(breaks = scales::breaks_log(n = 10), labels =
## scales::number_format(accuracy = 1e-04)): log-10 transformation introduced
## infinite values.
## Warning: Removed 680 rows containing non-finite outside the scale range
## (`stat_boxplot()`).